package cn.wanghaomiao.crawlers;

import java.util.List;

import org.apache.commons.lang.StringUtils;

import cn.wanghaomiao.dao.HBaseDao;
import cn.wanghaomiao.dao.UniversityDao;
import cn.wanghaomiao.model.University;
import cn.wanghaomiao.seimi.annotation.Crawler;
import cn.wanghaomiao.seimi.def.BaseSeimiCrawler;
import cn.wanghaomiao.seimi.struct.Request;
import cn.wanghaomiao.seimi.struct.Response;
import cn.wanghaomiao.xpath.model.JXDocument;

/**
 * Crawler that collects the Baidu Tieba directory addresses of universities
 * across all provinces.
 * <p>
 * The seed page is the Tieba category directory. Every link found in its
 * {@code root_dir_box} table is queued as a follow-up request whose response
 * is handled by {@link #universityBean(Response)}, which renders it into a
 * {@link University} and stores it through {@link UniversityDao}.
 * 
 * @author 胡宇鹏
 * 
 */
@Crawler(name = "U")
public class UniversityCrawler extends BaseSeimiCrawler {

	/** Site root prepended to relative links scraped from the directory page. */
	private static final String PREFIX = "http://tieba.baidu.com";

	/**
	 * Prepares storage and returns the seed URL of the crawl.
	 * <p>
	 * Side effect: asks {@link HBaseDao} to create the {@code university}
	 * table before any page is fetched.
	 * 
	 * @return a single-element array holding the Tieba university directory URL
	 */
	@Override
	public String[] startUrls() {
		// Create the "university" table; the two names are intended to hold
		// the school name and its link.
		// NOTE(review): presumably these are HBase column families - confirm
		// against HBaseDao.createTable.
		HBaseDao.createTable("university",
				new String[] { "univName", "univUrl" });

		System.out.println("开始抓取...");
		// Seed link; the fd/sd query values are already percent-encoded.
		return new String[] { "http://tieba.baidu.com/f/fdir?fd=%B8%DF%B5%C8%D4%BA%D0%A3&sd=%D0%A3%D4%B0%C7%E0%B4%BA" };
	}

	/**
	 * Handles the seed page: extracts every university link from the
	 * directory table and queues one request per link, with
	 * {@code universityBean} as the callback.
	 * <p>
	 * Null or blank hrefs are reported and skipped. A failure while queuing
	 * one link is printed and does not prevent the remaining links from
	 * being queued.
	 * 
	 * @param response
	 *            the fetched seed page
	 */
	@Override
	public void start(Response response) {
		System.out.println("获取html文本...");
		JXDocument doc = response.document(); // parsed HTML of the seed page

		List<Object> urls;
		try {
			// Select the href of every university link on the page.
			urls = doc
					.sel("//div[@class='root_dir_box']/table/tbody/tr/td/ul/li/a/@href");
		} catch (Exception e) {
			// Without the link list there is nothing to queue.
			e.printStackTrace();
			return;
		}
		if (urls == null) {
			System.out.println("链接不存在.");
			return;
		}

		for (Object s : urls) {
			// Null-safe conversion: a null element must not abort the loop.
			String href = (s == null) ? null : StringUtils.trim(s.toString());
			if (StringUtils.isBlank(href)) {
				System.out.println("链接不存在.");
				continue;
			}
			try {
				push(Request.build(toAbsoluteUrl(href), "universityBean"));
			} catch (Exception e) {
				// Isolate the failure to this link and keep going.
				e.printStackTrace();
			}
		}
	}

	/**
	 * Callback for each queued university link: renders the response into a
	 * {@link University}, prints its URL and stores it.
	 * <p>
	 * Any exception raised while rendering or storing is printed and
	 * swallowed so that one bad page does not stop the crawl.
	 * 
	 * @param response
	 *            the fetched university page
	 */
	public void universityBean(Response response) {
		try {
			University univ = response.render(University.class);
			System.out.println(univ.getUniv_url());
			UniversityDao.insert(univ); // persist the record to HBase
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Turns a scraped href into a full URL.
	 * 
	 * @param href
	 *            non-blank href taken from the page
	 * @return {@code href} unchanged when it already carries an http/https
	 *         scheme, otherwise {@code PREFIX + href}
	 */
	private static String toAbsoluteUrl(String href) {
		if (href.startsWith("http://") || href.startsWith("https://")) {
			return href;
		}
		return PREFIX + href;
	}
}
